package com.nb.crawler.jd;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.google.gson.Gson;
import com.nb.crawler.model.Product;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/**
 * Crawls the product listing module of a JD shop page by page.
 *
 * <p>Each page is requested from the module endpoint, the JSON response is
 * decoded into a {@link JDProductListResponse}, and its module text is parsed
 * as HTML to extract {@link Product} entries.
 */
public class JDCrawler {
	/** Endpoint template; the two {@code %s} placeholders are the page number and the page size, in that order. */
	private static final String URL_TEMPLATE = "http://module.jshop.jd.com/module/getModuleHtml.html?pagePrototype=8&appId=211751&orderBy=5&pageNo=%s&direction=1&categoryId=0&pageSize=%s&domainKey=newbalance&pageInstanceId=8376117&moduleInstanceId=8376124&prototypeId=68&templateId=401682&layoutInstanceId=8376120&origin=0&shopId=60920&venderId=64877&_=1420962830146";

	/** Number of items requested per page. */
	private static final int PAGE_SIZE = 100;

	/**
	 * Fetches and parses pages {@code 1..pages} of the product listing.
	 *
	 * <p>A page whose download fails is skipped. The parsed products are
	 * currently not stored or returned by this method.
	 *
	 * @param pages number of pages to crawl; values below 1 crawl nothing
	 */
	public void crawl(int pages) {
		// One client is shared by all page requests and released when the crawl ends,
		// instead of building (and leaking) a new client for every page.
		HttpClient client = HttpClientBuilder.create()
				.disableRedirectHandling().build();

		try {
			for (int i = 1; i <= pages; i++) {
				String html = getProductListHtml(client, i);
				if (html == null) {
					// Download or decoding failed for this page; nothing to parse.
					continue;
				}
				parseHtml(html);
			}
		} finally {
			closeClient(client);
		}
	}

	/**
	 * Downloads one page of the listing and returns the module text carried in
	 * the JSON response.
	 *
	 * @param client HTTP client used to execute the request
	 * @param page   1-based page number substituted into the URL template
	 * @return the module text of the response, or {@code null} if the request
	 *         failed with an {@link IOException} or the body decoded to nothing
	 */
	private String getProductListHtml(HttpClient client, int page) {
		String finalUrl = String.format(URL_TEMPLATE, page, PAGE_SIZE);

		try {
			HttpResponse response = client.execute(new HttpGet(finalUrl));

			String json = EntityUtils.toString(response.getEntity());
			System.out.println(json);
			Gson gson = new Gson();
			JDProductListResponse productListResponse = gson.fromJson(json,
					JDProductListResponse.class);

			// Gson yields null for an empty body; treat that like a failed page.
			if (productListResponse == null) {
				return null;
			}
			return productListResponse.getModuleText();
		} catch (IOException e) {
			e.printStackTrace();
		}

		return null;
	}

	/**
	 * Extracts products from a listing HTML fragment.
	 *
	 * <p>Every {@code <li>} under a {@code <ul>} is inspected; an item
	 * contributes a product only when its title link is found.
	 *
	 * @param html HTML fragment to parse; may be {@code null}
	 * @return the products found, never {@code null}; empty when {@code html}
	 *         is {@code null} or no item matches
	 */
	private List<Product> parseHtml(String html) {
		List<Product> products = new ArrayList<>();
		if (html == null) {
			return products;
		}

		HtmlCleaner htmlCleaner = new HtmlCleaner();
		TagNode tagNode = htmlCleaner.clean(html);

		try {
			Object[] items = tagNode.evaluateXPath("//ul/li");

			for (Object obj : items) {
				TagNode itemNode = (TagNode) obj;

				// Title
				Object[] titleLinks = itemNode.evaluateXPath("//div[@class='jItem']/div[@class='jGoodsInfo']/div[@class='jDesc']/a");
				if (titleLinks == null || titleLinks.length == 0) {
					// No title link under this item, so it is not recorded as a product.
					continue;
				}

				Product product = new Product();
				String title = ((TagNode) titleLinks[0]).getText().toString().trim();
				product.setTitle(title);
				products.add(product);
			}

		} catch (XPatherException e) {
			e.printStackTrace();
		}
		return products;
	}

	/**
	 * Releases the resources held by the client, if it supports closing.
	 *
	 * @param client client to close; ignored when it is not {@link Closeable}
	 */
	private static void closeClient(HttpClient client) {
		if (client instanceof Closeable) {
			try {
				((Closeable) client).close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}
